# download_ijmil_issue.py
# IJMIL (International Journal of Media and Information Literacy) Downloader
# Automates downloading PDFs from IJMIL (Cherkasgu Press platform)
# - Parses archive page for author/title blocks
# - Extracts correct /pdf.html links instead of broken direct URLs
# - Skips full-issue PDFs with no title/abstract
# - Renames PDFs based on article title only (excluding authors)
# - Creates dynamic folders using Vol/Issue/Year extracted from page metadata
# - Logs all downloads with article titles, URLs, and statuses into a CSV file

import os
import re
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# ---------- Helpers ----------
def sanitize_filename(name, max_length=200):
    """Return *name* cleaned up for use as a single file-name component.

    The following is done, in order:

    * the reserved characters backslash, slash, ``*``, ``?``, ``:``, ``"``,
      ``<``, ``>`` and ``|`` are removed;
    * control characters (newlines, tabs, ...) are replaced by spaces;
    * runs of whitespace are collapsed to one space and the ends are trimmed;
    * the result is cut to at most *max_length* characters.

    Args:
        name: Raw text to turn into a file name (without extension).
        max_length: Maximum number of characters kept; the default leaves
            room for an extension within common 255-character name limits.

    Returns:
        The cleaned name, or ``"untitled"`` when nothing usable is left.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name)
    # Text scraped from HTML may span several lines; control characters make
    # awkward (and on some platforms invalid) file names.
    cleaned = re.sub(r'[\x00-\x1f\x7f]', " ", cleaned)
    cleaned = re.sub(r'\s+', " ", cleaned).strip()
    # Truncation can expose a trailing space, so trim once more afterwards.
    cleaned = cleaned[:max_length].rstrip()
    # An empty result would make the caller write a bare ".pdf" file.
    return cleaned or "untitled"

def extract_issue_info(html):
    """
    Build the download folder name from the issue citation found in the page.

    Searches for a citation such as
    'International Journal of Media and Information Literacy, 2020, 5(1): ...'
    (journal name, year and volume(issue) on one line, since '.' does not
    match newlines here) and returns 'IJMIL_Vol<vol>_Issue<issue>_<year>'.
    When no such citation is present, the first four-digit run anywhere in
    the text is taken as the year ('IJMIL_<year>'); if the text contains no
    four-digit run at all the result is 'IJMIL_Year'.
    """
    citation = re.search(r'International Journal of Media and Information Literacy.*?(\d{4}).*?(\d+)\((\d+)\)', html)
    if citation is not None:
        year, vol, issue = citation.groups()
        return f"IJMIL_Vol{vol}_Issue{issue}_{year}"

    # No full citation: settle for the first four-digit run in the page.
    first_four_digits = re.search(r'(\d{4})', html)
    if first_four_digits is None:
        year = "Year"
    else:
        year = first_four_digits.group(1)
    return f"IJMIL_{year}"

# ---------- Input ----------
archive_url = input("Enter IJMIL archive URL: ").strip()

# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so without this a stalled connection would block the script
# indefinitely.
REQUEST_TIMEOUT = 30

# ---------- Fetch ----------
print(f"[INFO] Fetching: {archive_url}")
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get(archive_url, headers=headers, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
html = resp.text
# NOTE: the parsed tree is not used below; extraction is regex-based on the
# raw HTML text.
soup = BeautifulSoup(html, "html.parser")

# ---------- Folder ----------
# The folder (and the log inside it) is named after the issue metadata
# found in the page.
folder_name = extract_issue_info(html)
os.makedirs(folder_name, exist_ok=True)

log_path = os.path.join(folder_name, f"{folder_name}_log.csv")

count = 0
# Lower-cased file stems already handed out in this run, used to keep two
# articles with the same title from overwriting each other.
used_names = set()

# ---------- Regex parse numbered entries ----------
# Each match yields (title, optional abstract block, relative /pdf.html link).
pattern = re.compile(
    r'\d+\.\s*<b>.*?</b><br><div style="margin-left:16px;">(.*?)<br>.*?(<b>Abstract:</b>.*?)?<a href="(/pdf\.html\?n=\d+\.pdf)"',
    re.S)

matches = pattern.findall(html)
print(f"[INFO] Found {len(matches)} potential articles")

# The context manager guarantees the log is flushed and closed even if the
# run is interrupted or an unexpected error escapes the loop.
with open(log_path, "w", newline="", encoding="utf-8") as log_file:
    csv_writer = csv.writer(log_file)
    csv_writer.writerow(["Title", "PDF URL", "Status"])

    for idx, (title, abstract_block, pdf_rel) in enumerate(matches, start=1):
        title = title.strip()

        # Skip full issue or empty titles
        if not title or title.lower().startswith("full number"):
            print(f"[SKIP] Full issue at #{idx}")
            csv_writer.writerow([title, "", "Skipped (Full Issue)"])
            continue

        # Skip no abstract (likely non-article)
        if not abstract_block or "<b>Abstract:</b>" not in abstract_block:
            print(f"[SKIP] No abstract for: {title}")
            csv_writer.writerow([title, "", "Skipped (No Abstract)"])
            continue

        pdf_url = urljoin(archive_url, pdf_rel)

        clean_title = sanitize_filename(title)
        # Disambiguate repeated titles with " (2)", " (3)", ...; compared
        # case-insensitively so names differing only by case cannot collide
        # on case-insensitive filesystems.
        file_stem = clean_title
        suffix = 2
        while file_stem.lower() in used_names:
            file_stem = f"{clean_title} ({suffix})"
            suffix += 1
        used_names.add(file_stem.lower())
        pdf_path = os.path.join(folder_name, f"{file_stem}.pdf")

        print(f"[{count+1}] Downloading: {clean_title}")
        try:
            r = requests.get(pdf_url, headers=headers, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            with open(pdf_path, "wb") as f:
                f.write(r.content)
        except (requests.RequestException, OSError) as e:
            # Network/HTTP failures and file-system errors are recorded and
            # the run continues with the next article.
            print(f"[ERROR] {title} - {e}")
            csv_writer.writerow([title, pdf_url, f"Error: {e}"])
        else:
            csv_writer.writerow([title, pdf_url, "OK"])
            count += 1

print(f"\nDone! {count} PDFs saved in {folder_name}")
print(f"Log file created: {log_path}")
